%matplotlib inline
import os
import pickle
import numpy as np
import pandas as pd
from Bio import SeqIO
from kindel import kindel
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf
# Enable cufflinks offline mode so DataFrame.iplot() renders locally without a plotly account
cf.go_offline()
from scipy.stats import wilcoxon
from scipy.stats import mannwhitneyu
# Locations of interleaved reads and consensus mappings (full and 100k-subsampled BAMs)
reads_path = '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/'
bam_path = '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/'
bam_sub_path = '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/'

# Map sample id (e.g. '1.1') -> absolute path of its gzipped interleaved fastq
reads_fns = [fn for fn in os.listdir(reads_path) if fn.endswith('.gz')]
ids_reads = {
    fn.replace('.R12.fastq.gz','').replace('HCV_AVU_AB_',''): reads_path + fn
    for fn in reads_fns
}
ids_reads
# Map sample id -> full BAM path (id is the filename minus its '.bam' suffix)
bams_fns = [fn for fn in os.listdir(bam_path) if fn.endswith('.bam')]
ids_bams = {fn.replace('.bam',''): bam_path + fn for fn in bams_fns}
ids_bams
# Map sample id -> 100k-read subsampled BAM path (strip the '.100k.bam' suffix)
bams_sub_fns = [fn for fn in os.listdir(bam_sub_path) if fn.endswith('.bam')]
ids_bams_sub = {fn.replace('.100k.bam',''): bam_sub_path + fn for fn in bams_sub_fns}
ids_bams_sub
# for id, record in ids_records.items():
# SeqIO.write(record, '{}.cns.fa'.format(id), 'fasta')
# print(record.id)
# On cluster
# Sample identifiers, formatted patient.timepoint (37 samples, 14 patients)
ids = [
    '1.1', '1.2', '1.3', '1.4', '1.5',
    '10.1', '10.2',
    '11.1', '11.2',
    '12.1', '12.2',
    '13.1', '13.2',
    '14.1', '14.2',
    '2.1', '2.2', '2.3', '2.4',
    '3.1', '3.2', '3.3', '3.4', '3.5',
    '4.1', '4.2',
    '5.1', '5.2',
    '6.1', '6.2',
    '7.1', '7.2',
    '8.1', '8.2', '8.3',
    '9.1', '9.2',
]
len(ids)
# segemehl mapping command per sample (intended to be run on the cluster — see
# the note above): map interleaved reads against that sample's own consensus,
# then pipe through samtools to a sorted BAM
template = ('~/segemehl/segemehl_0_2_0/segemehl.x'
            ' -t 6 -A 60 -R 50 --maxout 1 -b'
            ' -d ~/data/phe_hcv_tf/all_cns_man/{id}.cns.fa'
            ' -x ~/data/phe_hcv_tf/all_cns_man/{id}.cns.idx'
            ' -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_{id}.R12.fastq.gz'
            ' | samtools view -b -'
            ' | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/{id} -')
for sample_id in ids:
    print(template.format(id=sample_id))
# Subsample each BAM to 100k mapped reads with BBMap's reformat.sh
cmd = ('/opt/sci/bin/bbmap/reformat.sh'
       ' in={fn}'
       ' out={bam_sub_path}{id}.100k.bam'
       ' samplereadstarget=100000'
       ' mappedonly=t')
for sample_id, bam_fn in ids_bams.items():
    os.system(cmd.format(id=sample_id, fn=bam_fn, bam_sub_path=bam_sub_path))
    print(sample_id)
# kindel weights — run once on the BAMs (or subsampled BAMs); otherwise load the pickled weights dict below
# Per-sample, per-position weight tables (relative frequencies) from kindel
weights = {}
for sample_id, bam_fn in ids_bams_sub.items():
    weights[sample_id] = kindel.weights(bam_fn, relative=True)
    print(sample_id)
# Persist and restore the weights dict.
# BUG FIX: the original dumped to 'res/2017-07-04/weights' but loaded
# 'res/2017-07-04/weights.p' — a path mismatch; one path is now used for both.
weights_pickle_path = 'res/2017-07-04/weights.p'
with open(weights_pickle_path, 'wb') as fh:
    pickle.dump(weights, fh)
with open(weights_pickle_path, 'rb') as fh:
    weights = pickle.load(fh)
# Initial timepoint per patient: ids ending '.1'
first_weights = {sample_id: wt for sample_id, wt in weights.items() if sample_id.endswith('.1')}
# Final timepoint per patient: ids whose successor timepoint is absent.
# NOTE(review): uses the last character only (id[-1]), so it assumes
# single-digit timepoint suffixes — holds for this dataset (max '.5')
last_ids = [sample_id for sample_id in weights
            if sample_id.partition('.')[0] + '.' + str(int(sample_id[-1]) + 1) not in weights]
last_weights = {sample_id: wt for sample_id, wt in weights.items() if sample_id in last_ids}
# Reference: https://www.ncbi.nlm.nih.gov/nuccore/NC_004102 (HCV H77)
# Lengths not conformant to H77 coordinates?
# first_weights['1.1']['shannon']
# Per-position Shannon entropy / consensus support, one column per sample.
# (Comprehension variable renamed: the original reused the name `weights`
# inside the comprehensions, shadowing the outer dict.)
shannons = pd.DataFrame({sample_id: wts['shannon'] for sample_id, wts in weights.items()})
first_shannons = pd.DataFrame({sample_id: wts['shannon'] for sample_id, wts in first_weights.items()})
last_shannons = pd.DataFrame({sample_id: wts['shannon'] for sample_id, wts in last_weights.items()})
first_consensuses = pd.DataFrame({sample_id: wts['consensus'] for sample_id, wts in first_weights.items()})
last_consensuses = pd.DataFrame({sample_id: wts['consensus'] for sample_id, wts in last_weights.items()})
# Per-sample median consensus support, initial vs final timepoints (log y)
pd.DataFrame(dict(first=first_consensuses.median(), last=last_consensuses.median())).plot.bar(logy=True)
# (duplicate of the previous cell, left from repeated notebook execution)
pd.DataFrame(dict(first=first_consensuses.median(), last=last_consensuses.median())).plot.bar(logy=True)
# Per-position medians across samples (axis=1), static matplotlib line plot
pd.DataFrame(dict(first=first_shannons.median(axis=1), last=last_shannons.median(axis=1))).plot()
# Interactive plotly versions via cufflinks .iplot()
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot()
# (duplicate of the previous cell)
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot()
# Rolling-median smoothing (window=5) of the per-position median
first_consensuses.median(axis=1).rolling(window=5, center=True).median()
# Smoothed profiles over positions 50–9300 — presumably trimming the
# reference ends; TODO confirm against the reference length
pd.DataFrame(dict(first=first_shannons[50:9300].median(axis=1).rolling(window=25, center=True).median(), last=last_shannons[50:9300].median(axis=1).rolling(window=25, center=True).median())).iplot(colors=['#835AF1', '#F66095'])
pd.DataFrame(dict(first=first_consensuses[50:9300].median(axis=1).rolling(window=20, center=True).median(), last=last_consensuses[50:9300].median(axis=1).rolling(window=20, center=True).median())).iplot(colors=['#835AF1', '#F66095'])
# Stacked histogram of per-position median consensus support
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot(kind='histogram', barmode='stack', bins=100, histnorm='probability')
first_shannons.iplot()
first_consensuses.iplot()
# NOTE(review): first_shannons_flat / last_shannons_flat are defined further
# down in this file (notebook cells were executed out of order) — running the
# file top-to-bottom raises NameError here.
hist_data = [first_shannons_flat, last_shannons_flat]
group_labels = ['Initial', 'Final']
colours = ['#F66095', '#2BCDC1']
# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, colors=colours, bin_size=0.02, show_rug=False)
# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')
# (duplicate of the previous distplot cell, left from repeated execution)
# Group data together
hist_data = [first_shannons_flat, last_shannons_flat]
group_labels = ['Initial', 'Final']
colours = ['#F66095', '#2BCDC1']
# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, colors=colours, bin_size=0.02, show_rug=False)
# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')
# Histogram + KDE of sample 1.1's per-position entropy
sns.distplot(weights['1.1'].shannon)
# Flatten the per-position matrices to 1-D arrays.
# NOTE: ±inf is mapped to 0 here, whereas the later weights_pp section maps
# ±inf to NaN and drops those rows — the two conventions differ.
first_shannons_flat = first_shannons.replace([np.inf, -np.inf], 0).dropna().values.flatten()
last_shannons_flat = last_shannons.replace([np.inf, -np.inf], 0).dropna().values.flatten()
first_consensuses_flat = first_consensuses.replace([np.inf, -np.inf], 0).dropna().values.flatten()
last_consensuses_flat = last_consensuses.replace([np.inf, -np.inf], 0).dropna().values.flatten()
# Here ±inf becomes NaN and affected rows are dropped instead of zeroed
sns.distplot(last_shannons.replace([np.inf, -np.inf], np.nan).dropna().values.flatten())
# Overlaid probability histograms of per-position entropy, initial vs final
trace1 = go.Histogram(x=first_shannons_flat,
                      nbinsx=150,
                      opacity=0.33,
                      marker=dict(color='#ff9f1c'),
                      name='Initial timepoints',
                      histnorm='probability')
trace2 = go.Histogram(x=last_shannons_flat,
                      nbinsx=150,
                      opacity=0.33,
                      marker=dict(color='#2ec4b6'),
                      name='Final timepoints',
                      histnorm='probability')
data = [trace1, trace2]
layout = go.Layout(barmode='overlay',
                   title='Per position median entropy',
                   xaxis=dict(range=[0, 1]))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
# Summary medians of the flattened entropy arrays
np.median(first_shannons_flat)
np.median(last_shannons_flat)
# Overlaid probability histograms of per-position consensus support,
# initial vs final timepoints (x-axis zoomed to 0.9–1.0)
trace1 = go.Histogram(
x=first_consensuses_flat,
nbinsx=750,
opacity=0.33,
marker=dict(color='#ff9f1c'),
name='Initial timepoints',
histnorm='probability'
)
trace2 = go.Histogram(
x=last_consensuses_flat,
nbinsx=750,
opacity=0.33,
marker=dict(color='#2ec4b6'),
name='Final timepoints',
histnorm='probability'
)
data = [trace1, trace2]
layout = go.Layout(
barmode='overlay',
title='Per position median consensus support',
xaxis=dict(range=[0.9, 1]))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
# Summary medians of the flattened consensus-support arrays
np.median(first_consensuses_flat)
np.median(last_consensuses_flat)
# Per-sample median entropy
shannons.median()
# Switch to locally copied BAMs on the desktop (rebinding the earlier paths)
bam_path = '/Users/Bede/Desktop/bams/'
bam_sub_path = '/Users/Bede/Desktop/bams_100k/'
bams_fns = [fn for fn in os.listdir(bam_path) if fn.endswith('.bam')]
ids_bams = {fn.replace('.bam',''): bam_path + fn for fn in bams_fns}
ids_bams
# Subsample each desktop BAM to 100k mapped reads (same BBMap command as above)
cmd = ('/opt/sci/bin/bbmap/reformat.sh'
       ' in={fn}'
       ' out={bam_sub_path}{id}.100k.bam'
       ' samplereadstarget=100000'
       ' mappedonly=t')
for bam_id, bam_file in ids_bams.items():
    os.system(cmd.format(id=bam_id, fn=bam_file, bam_sub_path=bam_sub_path))
    print(bam_id)
# plot-clips
# BUG FIX: the original line was a markdown heading fused onto the first code
# line of the cell ("plot-clips¶ids_bams_sub = {}") — not valid Python;
# restored as a comment plus the assignment.
ids_bams_sub = {}
for fn in [fn for fn in os.listdir(bam_sub_path) if fn.endswith('.bam')]:
    id = fn.replace('.100k.bam','')
    ids_bams_sub[id] = bam_sub_path + fn
ids_bams_sub
# Render kindel's soft-clip plot for each subsampled BAM
for id, fn in ids_bams_sub.items():
    kindel.plotly_clips(fn)
    print(id)
# weights
# BUG FIX (x2): the original first line was a markdown heading fused onto code
# ("weights¶weights = {}") — not valid Python; and the pickle was dumped to
# 'res/2017-07-04/weights_pp' but loaded from 'weights_pp.p'. Both repaired.
weights = {}
for id, fn in ids_bams_sub.items():
    weights[id] = kindel.weights(fn, relative=True)
    print(id)
weights_pp_pickle_path = 'res/2017-07-04/weights_pp.p'
with open(weights_pp_pickle_path, 'wb') as fh:
    pickle.dump(weights, fh)
with open(weights_pp_pickle_path, 'rb') as fh:
    weights = pickle.load(fh)
# Partition samples into initial ('.1') and final (no successor) timepoints.
# NOTE(review): assumes single-digit timepoint suffixes (uses k[-1]) — holds
# for this dataset (max timepoint '.5')
first_weights = {k: v for k, v in weights.items() if k.endswith('.1')}
last_ids = [k for k in weights
            if k.partition('.')[0] + '.' + str(int(k[-1]) + 1) not in weights]
last_weights = {k: weights[k] for k in last_ids}
# Per-position entropy/consensus DataFrames; ±inf -> NaN -> 0.
# (Comprehension variable renamed — the original shadowed the outer `weights`.)
shannons = pd.DataFrame({k: wts['shannon'] for k, wts in weights.items()})
first_shannons = pd.DataFrame(
    {k: wts['shannon'] for k, wts in first_weights.items()}
).replace([np.inf, -np.inf], np.nan).replace(np.nan, 0)
last_shannons = pd.DataFrame(
    {k: wts['shannon'] for k, wts in last_weights.items()}
).replace([np.inf, -np.inf], np.nan).replace(np.nan, 0)
first_consensuses = pd.DataFrame(
    {k: wts['consensus'] for k, wts in first_weights.items()}
).replace([np.inf, -np.inf], np.nan).replace(np.nan, 0)
last_consensuses = pd.DataFrame(
    {k: wts['consensus'] for k, wts in last_weights.items()}
).replace([np.inf, -np.inf], np.nan).replace(np.nan, 0)
# Flattened 1-D arrays (no inf/NaN remain after the zero-fill above, so the
# repeated replace/dropna here is effectively a pass-through)
first_shannons_flat = first_shannons.replace([np.inf, -np.inf], np.nan).dropna().values.flatten()
last_shannons_flat = last_shannons.replace([np.inf, -np.inf], np.nan).dropna().values.flatten()
first_consensuses_flat = first_consensuses.replace([np.inf, -np.inf], np.nan).dropna().values.flatten()
last_consensuses_flat = last_consensuses.replace([np.inf, -np.inf], np.nan).dropna().values.flatten()
len(first_shannons_flat)
# Using coords from https://www.ncbi.nlm.nih.gov/nuccore/NC_004102
# Unpaired Mann-Whitney U over all positions, initial vs final entropy
mannwhitneyu(x=first_shannons.values.flatten(), y=last_shannons.values.flatten())
# (duplicate of the previous cell)
mannwhitneyu(x=first_shannons.values.flatten(), y=last_shannons.values.flatten())
# Positions 915–1490 — presumably the E1 gene region in NC_004102 nt
# coordinates (see note above); TODO confirm
first_shannons[915:1490].values.flatten()
mannwhitneyu(x=first_shannons[915:1490].values.flatten(), y=last_shannons[915:1490].values.flatten())
# NOTE(review): Wilcoxon is a paired test — pairing flattened position×sample
# values assumes equal array lengths and a meaningful pairing; verify
wilcoxon(x=first_shannons[915:1490].values.flatten(), y=last_shannons[915:1490].values.flatten())
# Per-sample median entropy over the region, re-keyed to patient id
# (partition('.')[0] drops the timepoint suffix)
e1_fs_d = dict(first_shannons[915:1490].median())
e1_fs_d_norm = {k.partition('.')[0]: v for k, v in e1_fs_d.items()}
e1_fs_d_norm_df = pd.DataFrame.from_dict(e1_fs_d_norm, orient='index')
e1_fs_d_norm_df.columns = ['first']
e1_ls_d = dict(last_shannons[915:1490].median())
e1_ls_d_norm = {k.partition('.')[0]: v for k, v in e1_ls_d.items()}
e1_ls_d_norm_df = pd.DataFrame.from_dict(e1_ls_d_norm, orient='index')
e1_ls_d_norm_df.columns = ['last']
# Patient-indexed first/last medians side by side, then a paired Wilcoxon
e1_fls_df = pd.concat([e1_fs_d_norm_df, e1_ls_d_norm_df], axis=1)
e1_fls_df
wilcoxon(x=e1_fls_df['first'].values, y=e1_fls_df['last'].values)
last_shannons[915:1490].median()
# Entropy profile of one sample for inspection
weights['14.1'].shannon.plot()
# Per-sample median entropy as a plain dict
dict(shannons.median())
# Per-position median entropy, initial vs final (static plot)
pd.DataFrame(dict(first=first_shannons.median(axis=1), last=last_shannons.median(axis=1))).plot()
# (another duplicate of the distplot cell)
# Group data together
hist_data = [first_shannons_flat, last_shannons_flat]
group_labels = ['Initial', 'Final']
colours = ['#F66095', '#2BCDC1']
# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, colors=colours, bin_size=0.02, show_rug=False)
# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')
# Histograms / line plots of per-position median consensus support
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot(kind='histogram', barmode='stack', bins=100, histnorm='probability')
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot()
# Rolling-median-smoothed entropy profiles
pd.DataFrame(dict(first=first_shannons.median(axis=1).rolling(window=25, center=True).median(), last=last_shannons.median(axis=1).rolling(window=25, center=True).median())).iplot(colors=['#835AF1', '#F66095'])
type(first_shannons_flat)
# Rolling-mean smoothing variants
last_shannons.median(axis=1).rolling(window=25, center=True).mean()
first_shannons.rolling(window=25, center=True).median()['1.1'].plot()
first_shannons.rolling(window=25, center=True).mean()['1.1'].plot()
pd.DataFrame(dict(first=first_shannons.median(axis=1), last=last_shannons.median(axis=1))).iplot(colors=['#835AF1', '#F66095'])
# NOTE(review): entropy smoothing uses window 66 for initial but 75 for final
# timepoints — possibly an unintentional asymmetry; confirm
pd.DataFrame(dict(initial_timepoints_entropy=first_shannons[:9025].median(axis=1).rolling(window=66, center=True).mean(), final_timepoints_entropy=last_shannons[:9025].median(axis=1).rolling(window=75, center=True).mean())).iplot(colors=['#835AF1', '#F66095'], legend=dict(x=0.78,y=1, bgcolor='rgba(0,0,0,0)'))
pd.DataFrame(dict(initial_timepoints_consensus=first_consensuses.median(axis=1)[:9025].rolling(window=66, center=True).mean(), final_timepoints_consensus=last_consensuses[:9025].median(axis=1).rolling(window=66, center=True).mean())).iplot(colors=['#3D9970', '#FF851B'], legend=dict(x=0.76,y=0, bgcolor='rgba(0,0,0,0)'))
# BUG FIX: matplotlib is imported as `mpl` at the top of the file, so the bare
# name `matplotlib` raised NameError on these two lines.
# NOTE(review): the 'seaborn' style name is deprecated in matplotlib >= 3.6
# (renamed 'seaborn-v0_8') — confirm against the installed version.
mpl.style.use('seaborn')
pd.DataFrame(dict(initial_timepoints_entropy=first_shannons[:9025].median(axis=1).rolling(window=66, center=True).mean(), final_timepoints_entropy=last_shannons[:9025].median(axis=1).rolling(window=75, center=True).mean())).plot()
mpl.style.available
def sinplot(flip=1):
    """Plot six offset sine curves with decreasing amplitude (seaborn styling demo)."""
    xs = np.linspace(0, 14, 100)
    for k in range(1, 7):
        amplitude = (7 - k) * flip
        plt.plot(xs, np.sin(xs + k * .5) * amplitude)
# Seaborn styling demo: apply 'white' then 'ticks' style, draw the demo
# curves, and strip the top/right spines
sns.set_style("white")
sns.set_style("ticks")
sinplot()
sns.despine()
# Wide, short static figure: smoothed per-position median entropy profiles
fig, ax = plt.subplots(figsize=(10, 2))
pd.DataFrame(dict(initial_timepoints_entropy=first_shannons[:9025].median(axis=1).rolling(window=66, center=True).mean(), final_timepoints_entropy=last_shannons[:9025].median(axis=1).rolling(window=75, center=True).mean())).plot(ax=ax)
# Same layout for the smoothed consensus-support profiles
fig, ax = plt.subplots(figsize=(10, 2))
pd.DataFrame(dict(initial_timepoints_consensus=first_consensuses.median(axis=1)[:9025].rolling(window=66, center=True).mean(), final_timepoints_consensus=last_consensuses[:9025].median(axis=1).rolling(window=66, center=True).mean())).plot(ax=ax)
# Three stacked panels sharing the x axis: entropy (ax1), consensus support
# (ax2) and a gene map (ax3, filled in by the patch-drawing cell below)
f, (ax1, ax2, ax3) = plt.subplots(3, figsize=(10, 6), sharex=True)
# sns.set_palette(sns.hls_palette(2, l=.6, s=.6))
sns.set_palette(sns.xkcd_palette(["windows blue", "dark mint"]))
# NOTE(review): entropy smoothing uses window 66 (initial) vs 75 (final) —
# possibly unintentional; confirm
pd.DataFrame(dict(initial_timepoints_entropy=first_shannons[:9025].median(axis=1).rolling(window=66, center=True).mean(), final_timepoints_entropy=last_shannons[:9025].median(axis=1).rolling(window=75, center=True).mean())).plot(ax=ax1)
sns.set_palette(sns.xkcd_palette(["windows blue", "dark mint"]))
pd.DataFrame(dict(initial_timepoints_consensus=first_consensuses.median(axis=1)[:9025].rolling(window=66, center=True).mean(), final_timepoints_consensus=last_consensuses[:9025].median(axis=1).rolling(window=66, center=True).mean())).plot(ax=ax2)
def patch(start, end, colour):
    """Return a filled rectangle spanning [start, end) at y=0 with height 1.

    BUG FIX: the original passed `end` (an absolute coordinate) as the
    Rectangle *width* — patches.Rectangle((start, 0), end, 1) — so every gene
    after the first extended far past its true end; the width is now
    end - start.
    """
    return patches.Rectangle((start, 0), end - start, 1, facecolor=colour)
# HCV polyprotein gene boundaries in amino acids, converted to nucleotides,
# then paired into consecutive (start, end) intervals
coords_aa = [1, 192, 384, 747, 810, 1027, 1658, 1712, 1973, 2421, 3012]
coords_nt = [pos * 3 for pos in coords_aa]
coords_fmt = list(zip([0] + coords_nt[:-1], coords_nt))
# NOTE(review): `palette` is not defined anywhere in this file — presumably the
# active seaborn palette (e.g. sns.color_palette()); confirm before running
genes = [patch(s, e, col) for (s, e), col in zip(coords_fmt, palette)]
for gene in genes:
    ax3.add_patch(gene)
# Axis labels for the three-panel figure
ax1.set_ylabel('Median entropy')
ax2.set_ylabel('Median consensus support')
ax3.set_xlabel('HCV polyprotein position (nt)')
ax3.set(yticks=[])
# Gene name labels at hand-tuned x positions (data coords, nt)
ax3.annotate('C', xy=(0, 0), xytext=(220, 0.5))
ax3.annotate('E1', xy=(0, 0), xytext=(794, 0.5))
ax3.annotate('E2', xy=(0, 0), xytext=(1626, 0.5))
ax3.annotate('P7', xy=(0, 0), xytext=(2250, 0.5))
ax3.annotate('NS2', xy=(0, 0), xytext=(2600, 0.5))
ax3.annotate('NS3', xy=(0, 0), xytext=(3820, 0.5))
ax3.annotate('NS4A', xy=(0, 0), xytext=(4800, 0.5))
ax3.annotate('NS4B', xy=(0, 0), xytext=(5330, 0.5))
ax3.annotate('NS5A', xy=(0, 0), xytext=(6400, 0.5))
ax3.annotate('NS5B', xy=(0, 0), xytext=(7900, 0.5))
# NOTE(review): set_style after plotting does not restyle the existing figure
sns.set_style('white')
# Save whatever the current figure is (the three-panel plot) as SVG
plt.savefig("first_last.svg")
# Removed truncated fragment `ax3.se` — an incomplete attribute access left
# over from notebook tab-completion that raised AttributeError at runtime.
# Arrowed annotation example: point anchored in data coords, label placed in
# axes-fraction coords
ax3.annotate(
    'C',
    xy=(1, 1),
    xycoords='data',
    xytext=(0.1, 0.5),
    textcoords='axes fraction',
    arrowprops=dict(facecolor='black', shrink=0.05),
    horizontalalignment='right',
    verticalalignment='top',
)
# Inspect the gene interval list
coords_fmt
# Candidate label x positions: approximate gene midpoints, shifted left 70 nt
[int(pos[0]-70+(pos[1]-pos[0])/2) for pos in coords_fmt]
# Palette previews (several duplicates from repeated execution)
sns.palplot(sns.hls_palette(6, l=.6, s=.6))
sns.palplot(sns.hls_palette(6, l=.6, s=.6))
sns.palplot(sns.hls_palette(4, l=.6, s=.6))
sns.palplot(list(reversed(sns.hls_palette(2, l=.6, s=.6))))
sns.palplot(sns.hls_palette(4, l=.6, s=.6))
coords_fmt